Data Analysis

Comprehensive Data Cleaning & Exploratory Analysis of Job Market Trends

Authors
Affiliation

Connor Coulter

Boston University

Wei Wang

Boston University

Balqis Bevi Abdul Hannan Kanaga

Boston University

1 Import Data

import pandas as pd, numpy as np, os

# --- Load dataset safely ---
CSV_PATHS = ["data/lightcast_job_postings.csv", "lightcast_job_postings.csv"]
csv_path = next((p for p in CSV_PATHS if os.path.exists(p)), None)
if not csv_path:
    raise FileNotFoundError("⚠️ lightcast_job_postings.csv not found")

df = pd.read_csv(csv_path, low_memory=False)
print("✅ Loaded dataset:", df.shape)

# --- Ensure key derived columns exist ---
# Industry display
if "NAICS_2022_6_NAME" in df.columns:
    df["INDUSTRY_DISPLAY"] = df["NAICS_2022_6_NAME"]
elif "INDUSTRY" in df.columns:
    df["INDUSTRY_DISPLAY"] = df["INDUSTRY"]
else:
    df["INDUSTRY_DISPLAY"] = "Unknown"

# Salary display
salary_candidates = ["SALARY","SALARY_MEDIAN","SALARY_MID","SALARY_ANNUAL","PAY_RATE"]
found_salary = next((c for c in salary_candidates if c in df.columns), None)
if found_salary:
    df["SALARY_DISPLAY"] = pd.to_numeric(df[found_salary], errors="coerce")
    df["SALARY_DISPLAY"].fillna(df["SALARY_DISPLAY"].median(), inplace=True)
else:
    df["SALARY_DISPLAY"] = np.nan

print("Derived columns:", {
    "INDUSTRY_DISPLAY": df["INDUSTRY_DISPLAY"].notna().sum(),
    "SALARY_DISPLAY": df["SALARY_DISPLAY"].notna().sum()
})
✅ Loaded dataset: (72498, 131)
Derived columns: {'INDUSTRY_DISPLAY': np.int64(72454), 'SALARY_DISPLAY': np.int64(72498)}
/var/folders/g7/sfc5tly50013vn_cy1c842180000gn/T/ipykernel_17312/1469011101.py:26: FutureWarning:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


print(“Derived non-null:”, { “INDUSTRY_DISPLAY”: df[“INDUSTRY_DISPLAY”].notna().sum(), “SALARY_DISPLAY”: df[“SALARY_DISPLAY”].notna().sum() })


::: {#885422d9 .cell execution_count=2}
``` {.python .cell-code}
# Data Cleaning & Preprocessing
# Drop Unnecessary Columns
columns_to_drop = [
    "ID","LAST_UPDATED_TIMESTAMP","DUPLICATES","ACTIVE_URLS","ACTIVE_SOURCES_INFO",
    "TITLE_RAW","BODY","COMPANY_RAW",
    "NAICS2","NAICS2_NAME","NAICS3","NAICS3_NAME","NAICS4","NAICS4_NAME",
    "NAICS5","NAICS5_NAME","NAICS6","NAICS6_NAME",
    "NAICS_2022_2","NAICS_2022_2_NAME","NAICS_2022_3","NAICS_2022_3_NAME",
    "NAICS_2022_4","NAICS_2022_4_NAME","NAICS_2022_5","NAICS_2022_5_NAME",
    "SOC_2","SOC_2_NAME","SOC_3","SOC_3_NAME","SOC_5","SOC_5_NAME",
    "CIP2","CIP2_NAME","CIP4","CIP4_NAME","CIP6","CIP6_NAME",
    "LOT_CAREER_AREA","LOT_CAREER_AREA_NAME","LOT_OCCUPATION","LOT_OCCUPATION_NAME",
    "LOT_SPECIALIZED_OCCUPATION","LOT_SPECIALIZED_OCCUPATION_NAME",
    "LOT_OCCUPATION_GROUP","LOT_OCCUPATION_GROUP_NAME",
    "LOT_V6_SPECIALIZED_OCCUPATION","LOT_V6_SPECIALIZED_OCCUPATION_NAME",
    "LOT_V6_OCCUPATION","LOT_V6_OCCUPATION_NAME","LOT_V6_OCCUPATION_GROUP",
    "LOT_V6_OCCUPATION_GROUP_NAME","LOT_V6_CAREER_AREA","LOT_V6_CAREER_AREA_NAME",
    "ONET","ONET_NAME","ONET_2019","ONET_2019_NAME"
]
drop_existing = [c for c in columns_to_drop if c in df.columns]
df.drop(columns=drop_existing, inplace=True)
print("Remaining columns (first 30):", list(df.columns)[:30])
Remaining columns (first 30): ['LAST_UPDATED_DATE', 'POSTED', 'EXPIRED', 'DURATION', 'SOURCE_TYPES', 'SOURCES', 'URL', 'MODELED_EXPIRED', 'MODELED_DURATION', 'COMPANY', 'COMPANY_NAME', 'COMPANY_IS_STAFFING', 'EDUCATION_LEVELS', 'EDUCATION_LEVELS_NAME', 'MIN_EDULEVELS', 'MIN_EDULEVELS_NAME', 'MAX_EDULEVELS', 'MAX_EDULEVELS_NAME', 'EMPLOYMENT_TYPE', 'EMPLOYMENT_TYPE_NAME', 'MIN_YEARS_EXPERIENCE', 'MAX_YEARS_EXPERIENCE', 'IS_INTERNSHIP', 'SALARY', 'REMOTE_TYPE', 'REMOTE_TYPE_NAME', 'ORIGINAL_PAY_PERIOD', 'SALARY_TO', 'SALARY_FROM', 'LOCATION']

:::

# Handle Missing Values
import missingno as msno, matplotlib.pyplot as plt

msno.heatmap(df)
plt.title("Missing Values Heatmap")
plt.show()

df.dropna(thresh=len(df) * 0.5, axis=1, inplace=True)

if "SALARY_DISPLAY" in df.columns:
    df["SALARY_DISPLAY"].fillna(df["SALARY_DISPLAY"].median(), inplace=True)

for col in df.select_dtypes(include="object").columns:
    df[col].fillna("Unknown", inplace=True)

# Remove Duplicates
subset_cols = [c for c in ["TITLE","COMPANY_NAME","LOCATION","POSTED"] if c in df.columns]
if subset_cols:
    before = len(df)
    df.drop_duplicates(subset=subset_cols, keep="first", inplace=True)
    print(f"Removed {before - len(df)} duplicates using {subset_cols}")

# Exploratory Data Analysis (EDA)
# Job Postings by Industry (Top 15)
import plotly.express as px

counts = (
    df["INDUSTRY_DISPLAY"]
    .value_counts(dropna=False)
    .head(15)
    .reset_index(name="Count")
    .rename(columns={"index": "Industry"})
    .sort_values("Count")
)
fig1 = px.bar(
    counts, x="Count", y="INDUSTRY_DISPLAY", orientation="h",
    title="Top 15 Industries by Number of Job Postings"
)
fig1.show()

/var/folders/g7/sfc5tly50013vn_cy1c842180000gn/T/ipykernel_17312/2976636553.py:11: FutureWarning:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



/var/folders/g7/sfc5tly50013vn_cy1c842180000gn/T/ipykernel_17312/2976636553.py:14: FutureWarning:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


Removed 3300 duplicates using ['TITLE', 'COMPANY_NAME', 'LOCATION', 'POSTED']
# Salary Distribution by Industry (Top 15)
sdf = df[["INDUSTRY_DISPLAY","SALARY_DISPLAY"]].copy()
sdf = sdf.dropna()
sdf = sdf[sdf["SALARY_DISPLAY"] > 0]

top_industries = sdf["INDUSTRY_DISPLAY"].value_counts().head(15).index
sdf = sdf[sdf["INDUSTRY_DISPLAY"].isin(top_industries)]

fig2 = px.box(
    sdf, x="INDUSTRY_DISPLAY", y="SALARY_DISPLAY",
    title="Salary Distribution by Industry (Top 15)",
    points=False
)
fig2.update_layout(xaxis_tickangle=-45)
fig2.show()

# Remote vs. On-Site Jobs
if "REMOTE_TYPE_NAME" in df.columns:
    rc = df["REMOTE_TYPE_NAME"].value_counts().reset_index()
    rc.columns = ["Remote Type","Count"]
    fig3 = px.pie(
        rc, names="Remote Type", values="Count",
        title="Remote vs. On-Site Job Distribution"
    )
    fig3.show()

2 EDA: Rationale & Insights

2.1 Job Postings by Industry

Why: Highlights sectors where demand is concentrated, showing which industries are actively hiring. Key Insights: The top three industries by job postings are Temporary Help Services, Miscellaneous Ambulatory Health Care Services, and Semiconductor and Related Device Manufacturing.

2.2 Salary Distribution by Industry

Why: Shows where negotiation power exists and highlights industries paying well. Key Insights: Automotive Parts and Accessories Retailers show a wide range (negotiation potential), while Barber Shops show a narrow range (little negotiation).

2.3 Remote vs. On-Site Jobs

Why: Workplace flexibility is a major factor in today’s job market. Key Insights: Most postings (78.3%) don’t specify remote status. About 17% are remote, 3.1% hybrid, and 1.6% explicitly not remote.